library(stm)
library(irlba)
library(stringr)
library(stringi)
library(CCA)
library(reshape)

## note: github.com/wilryh/parrot has updated code for this method
## one difference: this code relies on shell command (uses AWK), package uses R only

## Locations used by the script; all three are empty placeholders here.
## out_path and glove_dir are pasted directly in front of file names below,
## so they need a trailing path separator.
in_path <- ""
out_path <- ""
glove_dir <- ""

## The input files are read relative to in_path.
setwd(in_path)


options(stringsAsFactors = FALSE)

cat("\nReading..\n\n")
keep.cols <- c("Date", "Pub", "Text")
data2016 <- read.csv("full_2016.csv.gz", encoding = "UTF-8")[, keep.cols]
## The 2015 file is used without its first column and then given the same
## column names as the 2016 data (presumably the first column is a row
## index -- confirm against the file).
data2015 <- read.csv("full_2015.csv.gz", encoding = "UTF-8")[, -1]
names(data2015) <- keep.cols

alldata <- rbind(data2016, data2015)

## Keep documents that mention "muslim" anywhere (case-insensitive),
## transliterate to ASCII, then split each document on literal periods so
## that Text becomes a list column of sentence-like fragments.
mentions.muslim <- grepl("muslim", alldata$Text, ignore.case = TRUE)
subdata <- alldata[mentions.muslim, ]
subdata$Text <- strsplit(
    stri_trans_general(subdata$Text, "latin-ascii"),
    ".",
    fixed = TRUE
)

## Extract the sentences that mention "muslim" together with the sentence
## immediately before and after each mention.
##
## x: character vector of sentences belonging to one document.
##
## Returns a single string: the selected sentences, in document order,
## joined by a space ("" when nothing matches or x is empty).
##
## Matching is case-insensitive so that it agrees with the case-insensitive
## document filter applied before this function is called; a case-sensitive
## match would miss "Muslim"/"MUSLIM" and return "" for such documents.
findMuslimText <- function(x) {
    hits <- grep("muslim", x, ignore.case = TRUE)
    ## sort() keeps the excerpt in original sentence order; unique() removes
    ## overlaps between neighbouring windows.
    ind <- sort(unique(c(hits - 1, hits, hits + 1)))
    ## drop window positions that fall off either end of the document
    ind <- ind[ind > 0 & ind <= length(x)]
    paste(x[ind], collapse = " ")
}

## Replace each document's sentence list with its single excerpt string.
## vapply() guarantees a character vector even when subdata has no rows
## (sapply() would return an empty list in that case).
subdata$Text <- vapply(subdata$Text, findMuslimText, character(1))

## Number of retained documents per date (first comma in Date removed).
count.muslim.bydate <- table(sub(",", "", subdata$Date))

save(
    count.muslim.bydate,
    file = paste0(out_path, "transcript_count_muslim_mentions_by_date.RData")
)

## Tokenise with stm: lowercase, drop stopwords, keep words of length >= 2,
## no stemming. subdata is passed along as the per-document metadata.
processed <- textProcessor(
    documents = subdata$Text,
    metadata = subdata,
    lowercase = TRUE,
    removestopwords = TRUE,
    stem = FALSE,
    wordLengths = c(2, Inf)
)
out <- prepDocuments(processed$documents, processed$vocab, processed$meta)

## Build a dense document-by-term count matrix.
##
## documents: list with one 2-row matrix per document; row 1 holds column
##            (vocabulary) indices and row 2 the corresponding counts.
## vocab:     vocabulary vector; only its length is used, to set the
##            number of columns.
##
## Returns a numeric matrix with length(documents) rows and length(vocab)
## columns. An empty document list yields a 0-row matrix (seq_along avoids
## the 1:0 iteration that would otherwise index a non-existent element).
doc.to.tdm <- function(documents, vocab) {
    tdm <- matrix(0, nrow = length(documents), ncol = length(vocab))
    for (i in seq_along(documents)) {
        doc <- documents[[i]]
        tdm[i, doc[1, ]] <- doc[2, ]
    }
    tdm
}

cat("\nMaking TDM..\n\n")

## Long-format triplets for the sparse matrix: column 1 holds the term
## indices (row 1 of each document matrix), column 2 the list label that
## melt() attaches to each document, column 3 the counts (row 2 of each
## document matrix). The earlier stand-alone melt() call was removed: its
## result was overwritten immediately by this one.
d2 <- data.frame(
    melt(lapply(out$documents, function(x) x[1, ])),
    count = melt(lapply(out$documents, function(x) x[2, ]))[, 1]
)
## Rows are positioned by the numeric value of the document label.
tdm <- sparseMatrix(i = as.numeric(d2[, 2]), j = d2[, 1], x = d2[, 3])

cat("\nPrep embeddings..\n\n")
tdm.orig <- tdm

## One vocabulary word per line; used by awk below as the lookup set.
vocab.file <- paste0(out_path, "muslim_ban_transcript_words.csv")
write.table(
    out$vocab,
    vocab.file,
    row.names = FALSE, quote = FALSE, col.names = FALSE
)

## Read the rows of a GloVe text file whose first field appears in the
## first field of the vocabulary file.
##
## vocab.file: path to the word list (one word per line).
## emb.file:   path to the GloVe text file.
## prefix:     prefix for the dimension column names.
## n.dim:      number of embedding dimensions in emb.file.
##
## Returns a data frame with a "word" column followed by n.dim columns
## named <prefix>1 .. <prefix>n.dim.
read.glove.subset <- function(vocab.file, emb.file, prefix, n.dim) {
    ## awk: while reading the first file remember its first field; for the
    ## second file print the lines whose first field was remembered.
    ## Paths are tilde-expanded and shell-quoted so that spaces or shell
    ## metacharacters in them do not break the command.
    cmd <- paste(
        "awk",
        shQuote("FNR==NR{a[$1]; next} ($1 in a)"),
        shQuote(path.expand(vocab.file)),
        shQuote(path.expand(emb.file))
    )
    matched <- system(cmd, intern = TRUE)
    con <- textConnection(matched)
    ## close the text connection explicitly instead of leaving it open
    on.exit(close(con), add = TRUE)
    read.table(con, col.names = c("word", paste0(prefix, seq_len(n.dim))))
}

## find relevant words in embedding file
## in this version of the method, the word embeddings have little effect
wikipedia.emb <- read.glove.subset(
    vocab.file, paste0(glove_dir, "glove.6B.300d.txt"), "W", 300
)

twitter.emb <- read.glove.subset(
    vocab.file, paste0(glove_dir, "glove.twitter.27B.200d.txt"), "T", 200
)

## words available in both embedding sets
emb.words <- intersect(wikipedia.emb$word, twitter.emb$word)

## meta data
## Words removed from the vocabulary regardless of embedding coverage
## (presumably transcript header/boilerplate terms -- confirm).
dropped.words <- c(
    "international", "byline", "section", "length", "words", "aired",
    "guests", "domestic", "news", "et", "anchor"
)

## Keep a word when it has an embedding in both sets and is not in the
## dropped list; the same mask selects vocabulary entries and TDM columns.
keep.word <- (out$vocab %in% emb.words) & !(out$vocab %in% dropped.words)

out.sub.vocab <- out$vocab[keep.word]

tdm <- tdm[, keep.word]
## discard documents left with no retained words
tdm <- tdm[rowSums(tdm) > 0, ]

cat("\nPrep PCA..\n\n")
## method not designed for multiple counts, can create instability
tdm[tdm > 1] <- 1

## Word-by-word co-occurrence: with the binarised tdm, entry (i, j) is the
## number of documents containing both words and the diagonal is the number
## of documents containing each word.
for.pca <- crossprod(tdm)

## Each row divided by that word's own document count. Computed once here
## and reused for the PCA input, the column means and the saved matrix
## (it was previously recomputed for each of those uses).
w.by.w.l2 <- sweep(for.pca, 1, diag(for.pca), `/`)

cat("\nPCA..\n\n")
pca <- prcomp(w.by.w.l2)

## Restrict both embedding tables to the shared words and order their rows
## like out.sub.vocab so they line up with the rows of pca$x.
wikipedia.emb <- subset(wikipedia.emb, word %in% emb.words)
wikipedia.emb.sorted <- wikipedia.emb[match(out.sub.vocab, wikipedia.emb$word),]

twitter.emb <- subset(twitter.emb, word %in% emb.words)
twitter.emb.sorted <- twitter.emb[match(out.sub.vocab, twitter.emb$word),]

## Word selector applied below; currently selects every retained word.
thesub <- rep(TRUE, length(out.sub.vocab))

## per-word document counts
out.sub.vocab.n <- diag(for.pca)[thesub]

out.sub.vocab.orig <- out.sub.vocab

out.sub.vocab <- out.sub.vocab[thesub]

## first ten principal component scores per word
pcax <- pca$x[thesub, 1:10]

thesums <- colMeans(w.by.w.l2)[thesub]

## defaults
a <- 1                                  # exponent on the embedding scores
b <- 2                                  #higher is better, 2 is fine
k <- 1                                  # multiplier on the ridge penalty

cat("\nCCA..\n\n")
## Canonical correlation between the two embedding sets (word column
## dropped); the x-side canonical scores are used below.
emb.cc <- cc(
    wikipedia.emb.sorted[, -1],
    twitter.emb.sorted[, -1]
)
emb.scores <- emb.cc$scores$xscores[thesub, ]
## per-word document counts, used to weight each word's score row
word.counts <- diag(for.pca)[thesub]

## Regularized CCA of the first 200 principal component scores against the
## count-weighted embedding scores. The penalty on the PCA side is k times
## the variance of the first principal component.
thecc <- rcc(
    pca$x[thesub, 1:200],
    emb.scores^a * word.counts^b,
    lambda1 = k * pca$sdev[1]^2,
    lambda2 = 0                         #important that this is 0
)

cat("\nSaving..\n\n")

coefs.file <- paste0(
    out_path,
    "muslim_mentions_in_transcripts_coefs.RData"
)
save(
    pcax, w.by.w.l2, thesums,
    out.sub.vocab, out.sub.vocab.n,
    thecc,
    file = coefs.file
)
